Data Analysis with multivariate Linear Regression on data about voting for the 2006 and 2010 elections in Brazil for the “Câmara Federal de Deputados”. Data was taken from the TSE portal
eleicoes_data <- readr::read_csv(
here::here('data/eleicoes_2006_e_2010.csv'),
local=readr::locale("br"),
col_types = cols(
ano = col_integer(),
sequencial_candidato = col_character(),
quantidade_doacoes = col_integer(),
quantidade_doadores = col_integer(),
total_receita = col_double(),
media_receita = col_double(),
recursos_de_outros_candidatos.comites = col_double(),
recursos_de_pessoas_fisicas = col_double(),
recursos_de_pessoas_juridicas = col_double(),
recursos_proprios = col_double(),
`recursos_de_partido_politico` = col_double(),
quantidade_despesas = col_integer(),
quantidade_fornecedores = col_integer(),
total_despesa = col_double(),
media_despesa = col_double(),
votos = col_integer(),
.default = col_character()))
# Let's put everything in Upper case for uniformity
eleicoes_data %>%
mutate(nome = toupper(nome),
sexo = toupper(sexo),
grau = toupper(grau),
nome = toupper(nome),
cargo = toupper(cargo),
ocupacao = toupper(ocupacao),
partido = toupper(partido),
estado_civil = toupper(estado_civil),
sequencial_candidato = as.numeric(sequencial_candidato)) -> eleicoes_data
# Adding surrogate key to dataframe
eleicoes_data$id <- 1:nrow(eleicoes_data)
eleicoes_data %>%
glimpse()
## Observations: 7,476
## Variables: 25
## $ ano <int> 2006, 2006, 2006, 2006, ...
## $ sequencial_candidato <dbl> 10001, 10002, 10002, 100...
## $ nome <chr> "JOSÉ LUIZ NOGUEIRA DE S...
## $ uf <chr> "AP", "RO", "AP", "MS", ...
## $ partido <chr> "PT", "PT", "PT", "PRONA...
## $ quantidade_doacoes <int> 6, 13, 17, 6, 48, 6, 14,...
## $ quantidade_doadores <int> 6, 13, 16, 6, 48, 6, 7, ...
## $ total_receita <dbl> 16600.00, 22826.00, 1581...
## $ media_receita <dbl> 2766.67, 1755.85, 9301.2...
## $ recursos_de_outros_candidatos.comites <dbl> 0.00, 6625.00, 2250.00, ...
## $ recursos_de_pessoas_fisicas <dbl> 9000.00, 15000.00, 34150...
## $ recursos_de_pessoas_juridicas <dbl> 6300.00, 1000.00, 62220....
## $ recursos_proprios <dbl> 1300.00, 201.00, 59500.0...
## $ recursos_de_partido_politico <dbl> 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quantidade_despesas <int> 14, 24, 123, 8, 133, 9, ...
## $ quantidade_fornecedores <int> 14, 23, 108, 8, 120, 9, ...
## $ total_despesa <dbl> 16583.60, 20325.99, 1460...
## $ media_despesa <dbl> 1184.54, 846.92, 1187.09...
## $ cargo <chr> "DEPUTADO FEDERAL", "DEP...
## $ sexo <chr> "MASCULINO", "FEMININO",...
## $ grau <chr> "ENSINO MÉDIO COMPLETO",...
## $ estado_civil <chr> "CASADO(A)", "SOLTEIRO(A...
## $ ocupacao <chr> "VEREADOR", "SERVIDOR PÚ...
## $ votos <int> 8579, 2757, 17428, 1193,...
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, ...
eleicoes_data %>%
select(id,
quantidade_despesas,
quantidade_fornecedores,
recursos_de_partido_politico,
recursos_de_pessoas_juridicas,
recursos_de_pessoas_fisicas,
recursos_de_outros_candidatos.comites) %>%
melt(id=c("id")) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(. ~ variable,
ncol = 2,
scales = "free_x") +
labs(x="Predictor",y="Absolute Frequency")
eleicoes_data %>%
select(id,
total_receita,
media_receita,
total_despesa,
media_despesa,
recursos_proprios,
quantidade_doacoes,
quantidade_doadores) %>%
melt(id=c("id")) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(. ~ variable,
scales = "free_x")
The standard method to deal with a positive skew is to apply a logarithmic transformation to the affected predictor. However, to apply the aforementioned transformation the predictor must not contain any 0.
eleicoes_data %>%
select(quantidade_doacoes,
quantidade_doadores,
total_receita,
media_receita,
recursos_de_outros_candidatos.comites,
recursos_de_pessoas_fisicas,
recursos_de_pessoas_juridicas,
recursos_proprios,
recursos_de_partido_politico,
quantidade_despesas,
quantidade_fornecedores,
total_despesa,
media_despesa) %>%
sapply(., function(x) 0 %in% x) %>%
as.data.frame(row.names = NULL) %>%
tibble::rownames_to_column() %>%
set_colnames(c("predictor","contains_zero")) %>%
arrange(contains_zero)
# apply logarithmic transformation
eleicoes_data %>%
mutate(log.quantidade_doacoes = log(quantidade_doacoes),
log.quantidade_doadores = log(quantidade_doadores),
log.quantidade_despesas = log(quantidade_despesas),
log.quantidade_fornecedores = log(quantidade_fornecedores)) -> eleicoes_data
# put all quantitative predictors (of interest) in same scale
eleicoes_data %>%
mutate_at(.vars = vars(quantidade_doacoes,
quantidade_doadores,
total_receita,
media_receita,
log.quantidade_doacoes,
log.quantidade_doadores,
log.quantidade_despesas,
log.quantidade_fornecedores,
sequencial_candidato,
recursos_de_outros_candidatos.comites,
recursos_de_pessoas_fisicas,
recursos_de_pessoas_juridicas,
recursos_proprios,
recursos_de_partido_politico,
quantidade_despesas,
quantidade_fornecedores,
total_despesa,
media_despesa),
.funs = funs(as.numeric(scale(.)))) -> scaled_data
eleicoes_data %>%
filter(ano == 2006) %>%
group_by(partido) %>%
summarize(n = sum(votos)) %>%
ggplot(aes(reorder(partido,n), n)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1)) +
labs(x="Political Party",
title="2006 elections",
y="Number of votes") -> p1
eleicoes_data %>%
filter(ano == 2010) %>%
group_by(partido) %>%
summarize(n = sum(votos)) %>%
ggplot(aes(reorder(partido,n), n)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1)) +
labs(x="Political Party",
title="2010 elections",
y="Number of votes") -> p2
grid.arrange(p1, p2, ncol=1)
eleicoes_data %>%
ggplot(aes(total_receita)) +
geom_histogram(bins = 30) +
labs(x="Total Revenue",
y="Absolute Frequency") +
facet_grid(. ~ ano) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
eleicoes_data %>%
ggplot(aes(media_receita)) +
geom_histogram(bins = 30) +
labs(x="Mean Revenue",
y="Absolute Frequency") +
facet_grid(. ~ ano)
eleicoes_data %>%
ggplot(aes(total_despesa)) +
geom_histogram(bins = 30) +
labs(x="Total Expenditure",
y="Absolute Frequency") +
facet_grid(. ~ ano)
eleicoes_data %>%
ggplot(aes(media_despesa)) +
geom_histogram(bins = 30) +
labs(x="Mean Expenditure",
y="Absolute Frequency") +
facet_grid(. ~ ano)
eleicoes_data %>%
ggplot(aes(recursos_proprios)) +
geom_histogram(bins = 30) +
labs(x="Total Revenue",
y="Absolute Frequency") +
facet_grid(. ~ ano)
eleicoes_data %>%
mutate(ano = as.factor(ano)) %>%
group_by(estado_civil, ano) %>%
summarize(n = n()) %>%
ggplot(aes(reorder(estado_civil,n), n,
fill= ano)) +
geom_bar(stat = "identity",
position = position_dodge(width = 0.5)) +
labs(x="Marital status of candidate",
y="Absolute Frequency") +
guides(fill = guide_legend(title = "year")) +
coord_flip()
2010 overall tops 2006 in the number of candidates of each and every marital status with one exception:
eleicoes_data %>%
mutate(ano = as.factor(ano)) %>%
group_by(grau, ano) %>%
summarize(n = n()) %>%
ggplot(aes(reorder(grau,n), n,
fill= ano)) +
geom_bar(stat = "identity",
position = position_dodge(width = 0.5)) +
labs(x="Education level",
y="Absolute Frequency") +
guides(fill = guide_legend(title = "year")) +
coord_flip()
2010 overall tops 2006 in the number of candidates of each and every education level with one exceptions:
eleicoes_data %>%
group_by(sexo, ano) %>%
summarize(n = n()) %>%
ggplot(aes(reorder(sexo,n), n)) +
geom_bar(stat = "identity") +
labs(x="Gender",
y="Absolute Frequency") +
facet_grid(. ~ano)
We’ll use the correlogram to have an idea on how the predictors interact with each and with the target variable. How the predictors react with the target votos is of particular interest.
eleicoes_data %>%
filter(ano == 2006) %>%
select(-partido,
-uf,-nome,
-estado_civil,
-ocupacao,-ano,
-cargo,-grau,-sexo,
-log.quantidade_doacoes,
-log.quantidade_doadores,
-log.quantidade_despesas,
-log.quantidade_fornecedores) %>%
na.omit() %>%
ggcorr(palette = "RdBu",
color = "grey50",
label = TRUE, hjust = 1,
label_size = 3, size = 4,
nbreaks = 5, layout.exp = 7) +
ggtitle("Correlation plot for 2006 elections")
eleicoes_data %>%
filter(ano == 2010) %>%
select(-partido,
-uf,-nome,
-estado_civil,
-ocupacao,-ano,
-cargo,-grau,-sexo,
-log.quantidade_doacoes,
-log.quantidade_doadores,
-log.quantidade_despesas,
-log.quantidade_fornecedores) %>%
na.omit() %>%
ggcorr(palette = "RdBu",
color = "grey50",
label = TRUE, hjust = 1,
label_size = 3, size = 4,
nbreaks = 5, layout.exp = 7) +
ggtitle("Correlation plot for 2010 elections")
eleicoes_data %>%
select(-partido,
-uf,-nome,
-estado_civil,
-ocupacao,-ano,
-cargo,-grau,-sexo,
-log.quantidade_doacoes,
-log.quantidade_doadores,
-log.quantidade_despesas,
-log.quantidade_fornecedores) %>%
na.omit() %>%
ggcorr(palette = "RdBu",
color = "grey50",
label = TRUE, hjust = 1,
label_size = 3, size = 4,
nbreaks = 5, layout.exp = 7) +
ggtitle("Correlation plot for both elections")
For the aforementioned reasons a multivariate linear regression model made with all variables isn’t plausible.
scaled_data %>%
filter(ano == 2006) -> scaled_data_2006
scaled_data_2006 %>%
sample_n(5)
set.seed(11) # We set the set for reason of reproducibility
scaled_data_2006 %>%
dplyr::sample_frac(.5) -> train_data_2006
encoding <- build_encoding(dataSet = train_data_2006,
cols = c("uf","sexo","grau",
"partido","estado_civil"),
verbose = F)
train_data_2006 <- one_hot_encoder(dataSet = train_data_2006,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Train Data ",
"\n##### Observations: ",nrow(train_data_2006),
"\n##### Variables: ",ncol(train_data_2006))
set.seed(11) # We set the set for reason of reproducibility
dplyr::anti_join(scaled_data_2006,
train_data_2006,
by = 'id') -> intermediate_data
intermediate_data %>%
dplyr::sample_frac(.5) -> test_data_2006
test_data_2006 <- one_hot_encoder(dataSet = test_data_2006,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Test Data ",
"\n##### Observations: ",nrow(test_data_2006),
"\n##### Variables: ",ncol(test_data_2006))
set.seed(11) # We set the set for reason of reproducibility
dplyr::anti_join(intermediate_data,
test_data_2006,
by = 'id') -> validate_data_2006
validate_data_2006 <- one_hot_encoder(dataSet = validate_data_2006,
encoding = encoding,
drop = TRUE,
verbose = F)
rm(intermediate_data)
cat("#### Validate Data ",
"\n##### Observations: ",nrow(validate_data_2006),
"\n##### Variables: ",ncol(validate_data_2006))
mod_2006 <- lm(votos ~ quantidade_fornecedores * partido.PSDB + media_despesa * partido.PMDB +
total_receita * total_despesa + uf.SP * estado.civil.CASADO.A. +
uf.RJ * total_despesa + total_receita * `grau.SUPERIOR COMPLETO`,
data = train_data_2006)
broom::glance(mod_2006)
broom::tidy(mod_2006,
conf.int = TRUE,
conf.level = 0.95)
scaled_data %>%
filter(ano == 2010) -> scaled_data_2010
scaled_data_2010 %>%
sample_n(5)
set.seed(11) # We set the set for reason of reproducibility
## Adding surrogate key to dataframe
scaled_data_2010$id <- 1:nrow(scaled_data_2010)
scaled_data_2010 %>%
dplyr::sample_frac(.5) -> train_data_2010
encoding <- build_encoding(dataSet = train_data_2010,
cols = c("uf","sexo","grau",
"partido","estado_civil"),
verbose = F)
train_data_2010 <- one_hot_encoder(dataSet = train_data_2010,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Train Data ",
"\n##### Observations: ",nrow(train_data_2010),
"\n##### Variables: ",ncol(train_data_2010))
set.seed(11) # We set the set for reason of reproducibility
dplyr::anti_join(scaled_data_2010,
train_data_2010,
by = 'id') -> intermediate_data
intermediate_data %>%
dplyr::sample_frac(.5) -> test_data_2010
test_data_2010 <- one_hot_encoder(dataSet = test_data_2010,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Test Data ",
"\n##### Observations: ",nrow(test_data_2010),
"\n##### Variables: ",ncol(test_data_2010))
set.seed(11) # We set the set for reason of reproducibility
dplyr::anti_join(intermediate_data,
test_data_2010,
by = 'id') -> validate_data_2010
validate_data_2010 <- one_hot_encoder(dataSet = validate_data_2010,
encoding = encoding,
drop = TRUE,
verbose = F)
rm(intermediate_data)
cat("#### Validate Data ",
"\n##### Observations: ",nrow(validate_data_2010),
"\n##### Variables: ",ncol(validate_data_2010))
mod_2010 <- lm(votos ~ quantidade_fornecedores * partido.PSDB + media_despesa * partido.PMDB +
total_receita * total_despesa + uf.SP * estado.civil.CASADO.A. +
uf.RJ * total_despesa + total_receita * `grau.SUPERIOR COMPLETO`,
data = train_data_2010)
broom::glance(mod_2010)
broom::tidy(mod_2010,
conf.int = TRUE,
conf.level = 0.95)
broom::tidy(mod_2006,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
arrange(desc(p.value)) %>%
slice(1:3) %>%
ggplot(aes(reorder(term,p.value), p.value)) +
geom_point(size = 2) +
labs(x = "Predictor variable",
y = "Estimated p-value",
title="Underperforming predictors (2006 elections)")
broom::tidy(mod_2010,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
arrange(desc(p.value)) %>%
slice(1:3) %>%
ggplot(aes(reorder(term,p.value), p.value)) +
geom_point(size = 2) +
labs(x = "Predictor variable",
y = "Estimated p-value",
title="Underperforming predictors (2010 elections)")
On both models we can see that predictors related to partido perform poorly. Also we could see the sizable appearance of categorical variable among underperformer predictors.
broom::tidy(mod_2006,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
filter(term != "(Intercept)") %>%
arrange(p.value) %>%
slice(1:3) %>%
ggplot(aes(reorder(term,p.value), p.value)) +
geom_hline(yintercept = 0.05, colour = "darkred") +
geom_point(size = 2) +
labs(x = "Predictor variable",
y = "Estimated p-value",
title="Overperforming predictors (2006 elections)")
broom::tidy(mod_2010,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
filter(term != "(Intercept)") %>%
arrange(p.value) %>%
slice(1:3) %>%
ggplot(aes(reorder(term,p.value), p.value)) +
geom_hline(yintercept = 0.05, colour = "darkred") +
geom_point(size = 2) +
labs(x = "Predictor variable",
y = "Estimated p-value",
title="Overperforming predictors (20010 elections)")
On both models different combinations of the predictors total_receita and total_despesa were clearly the best predictors (those that could explain the votes the most).
mod_2006 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot (2006 elections)")
mod_2010 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot (2010 elections)")
mod_2006 %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q (2006 elections)", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod_2010 %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q (2010 elections)", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod_2006 %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location (2006 elections)",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod_2010 %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location (2010 elections)",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod_2006 %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot (2006 elections)") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod_2010 %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot (2010 elections)") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod_2006 %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii) (2006 elections)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
mod_2010 %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii) (2010 elections)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
predictions <- mod_2006 %>% predict(validate_data_2006)
data.frame( R2 = caret::R2(predictions, validate_data_2006$votos),
RMSE = caret::RMSE(predictions, validate_data_2006$votos),
MAE = caret::MAE(predictions, validate_data_2006$votos),
ERR = caret::RMSE(predictions, validate_data_2006$votos)/
mean(validate_data_2006$votos))
Now let’s talk about the results taken from the validate data for 2006 elections (more meaningful).
The average difference between the observed known outcome values and the values predicted by the model (RMSE) was of approximately 29326.09. + Our model would miss the mark by approximately 29326 (RMSE), that is if candidate had one million votes we would predict 29326 more than we should (or less than we should). The average absolute difference between observed and predicted outcomes (MAE) was approximately 13022.77. * The prediction error rate (ERR) was 1.263851.
predictions <- mod_2010 %>% predict(validate_data_2010)
data.frame( R2 = caret::R2(predictions, validate_data_2010$votos),
RMSE = caret::RMSE(predictions, validate_data_2010$votos),
MAE = caret::MAE(predictions, validate_data_2010$votos),
ERR = caret::RMSE(predictions, validate_data_2010$votos)/
mean(validate_data_2010$votos))
Now let’s talk about the results taken from the validate data for 2010 elections (more meaningful).
The average difference between the observed known outcome values and the values predicted by the model (RMSE) was of approximately 30501.16 + Our model would miss the mark by approximately 30501 (RMSE), that is if candidate had one million votes we would predict 30501 more than we should (or less than we should). The average absolute difference between observed and predicted outcomes (MAE) was approximately 13789.98. * The prediction error rate (ERR) was 1.409471.
We have signs here that the model for 2006 elections fit the train set a little too much (better residual results but worse validation results).
predictions <- mod_2006 %>% predict(test_data_2006)
data.frame( R2 = caret::R2(predictions, test_data_2006$votos),
RMSE = caret::RMSE(predictions, test_data_2006$votos),
MAE = caret::MAE(predictions, test_data_2006$votos),
ERR = caret::RMSE(predictions, test_data_2006$votos)/
mean(test_data_2006$votos))
Now let’s talk about the results taken from the test data for 2006 elections (most meaningful).
The average difference between the observed known outcome values and the values predicted by the model (RMSE) was of approximately 31605.85 + Our model would miss the mark by approximately 31606 (RMSE), that is if candidate had one million votes we would predict 31606 more than we should (or less than we should). The average absolute difference between observed and predicted outcomes (MAE) was approximately 13832.72. * The prediction error rate (ERR) was 1.350674.
predictions <- mod_2010 %>% predict(test_data_2010)
data.frame( R2 = caret::R2(predictions, test_data_2010$votos),
RMSE = caret::RMSE(predictions, test_data_2010$votos),
MAE = caret::MAE(predictions, test_data_2010$votos),
ERR = caret::RMSE(predictions, test_data_2010$votos)/
mean(test_data_2010$votos))
Now let’s talk about the results taken from the test data for 2006 elections (most meaningful).
The average difference between the observed known outcome values and the values predicted by the model (RMSE) was of approximately 29033.86 + Our model would miss the mark by approximately 29034 (RMSE), that is if candidate had one million votes we would predict 29034 more than we should (or less than we should). The average absolute difference between observed and predicted outcomes (MAE) was approximately 12804.09. * The prediction error rate (ERR) was 1.461269.
At the end the model proved a better fit (a little better) for the 2010 elections than for the 2006 elections.
total_receita and total_despesa were clearly the best predictors (those that could explain the votes the most), the rest of the predictors, such as partido, estado.civil and quantidade_fornecedores were redundant. For this reason we’ll try to create a model using only the best predictors.
mod_2006 <- lm(votos ~ total_receita * total_despesa,
data = train_data_2006)
broom::glance(mod_2006)
mod_2006 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot (2006 elections)")
mod_2006 %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q (2006 elections)", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod_2006 %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location (2006 elections)",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod_2006 %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot (2006 elections)") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod_2006 %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii) (2006 elections)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
predictions <- mod_2006 %>% predict(validate_data_2006)
data.frame( R2 = caret::R2(predictions, validate_data_2006$votos),
RMSE = caret::RMSE(predictions, validate_data_2006$votos),
MAE = caret::MAE(predictions, validate_data_2006$votos),
ERR = caret::RMSE(predictions, validate_data_2006$votos)/
mean(validate_data_2006$votos))
Now let’s talk about the results taken from the validate data (more meaningful).
predictions <- mod_2006 %>% predict(test_data_2006)
data.frame( R2 = caret::R2(predictions, test_data_2006$votos),
RMSE = caret::RMSE(predictions, test_data_2006$votos),
MAE = caret::MAE(predictions, test_data_2006$votos),
ERR = caret::RMSE(predictions, test_data_2006$votos)/
mean(test_data_2006$votos))
Now let’s talk about the results taken from the validate data (more meaningful).
Our skimmed model fared worse on validation but had a slightly better result in test than the original model.
mod_2010 <- lm(votos ~ total_receita * total_despesa,
data = train_data_2010)
broom::glance(mod_2010)
mod_2010 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot (2010 elections)")
mod_2010 %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q (2010 elections)", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod_2010 %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location (2010 elections)",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod_2010 %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot (2010 elections)") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod_2010 %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii) (2010 elections)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
predictions <- mod_2010 %>% predict(validate_data_2010)
data.frame( R2 = caret::R2(predictions, validate_data_2010$votos),
RMSE = caret::RMSE(predictions, validate_data_2010$votos),
MAE = caret::MAE(predictions, validate_data_2010$votos),
ERR = caret::RMSE(predictions, validate_data_2010$votos)/
mean(validate_data_2010$votos))
Now let’s talk about the results taken from the validate data (more meaningful).
predictions <- mod_2010 %>% predict(test_data_2010)
data.frame( R2 = caret::R2(predictions, test_data_2010$votos),
RMSE = caret::RMSE(predictions, test_data_2010$votos),
MAE = caret::MAE(predictions, test_data_2010$votos),
ERR = caret::RMSE(predictions, test_data_2010$votos)/
mean(test_data_2010$votos))
Now let’s talk about the results taken from the test data (most meaningful).
We got a decent 0.57 R² and adjusted R² approximately (notice the decrease). This means that this model explain approximately 57% of the response variable variability.
The prediction error rate (ERR) was 1.380093.
set.seed(11) # We set the set for reason of reproducibility
## Adding surrogate key to dataframe
scaled_data$id <- 1:nrow(scaled_data)
scaled_data %>%
dplyr::sample_frac(.5) -> train_data
encoding <- build_encoding(dataSet = train_data,
cols = c("uf","sexo","grau",
"partido","estado_civil"),
verbose = F)
train_data <- one_hot_encoder(dataSet = train_data,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Train Data ",
"\n##### Observations: ",nrow(train_data),
"\n##### Variables: ",ncol(train_data))
set.seed(11) # We set the set for reason of reproducibility
dplyr::anti_join(scaled_data,
train_data,
by = 'id') -> intermediate_data
intermediate_data %>%
dplyr::sample_frac(.5) -> test_data
test_data <- one_hot_encoder(dataSet = test_data,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Test Data ",
"\n##### Observations: ",nrow(test_data),
"\n##### Variables: ",ncol(test_data))
set.seed(11) # We set the set for reason of reproducibility
dplyr::anti_join(intermediate_data,
test_data,
by = 'id') -> validate_data
validate_data <- one_hot_encoder(dataSet = validate_data,
encoding = encoding,
drop = TRUE,
verbose = F)
rm(intermediate_data)
cat("#### Validate Data ",
"\n##### Observations: ",nrow(validate_data),
"\n##### Variables: ",ncol(validate_data))
mod <- lm(votos ~ total_receita * total_despesa,
data = train_data)
broom::glance(mod)
mod %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot")
mod %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
Our residuals deviate a considerable deal from a normal distribution.
mod %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
predictions <- mod %>% predict(validate_data)
data.frame( R2 = caret::R2(predictions, validate_data$votos),
RMSE = caret::RMSE(predictions, validate_data$votos),
MAE = caret::MAE(predictions, validate_data$votos),
ERR = caret::RMSE(predictions, validate_data$votos)/
mean(validate_data$votos))
Now let’s talk about the results taken from the validate data (more meaningful).
predictions <- mod %>% predict(test_data)
data.frame( R2 = caret::R2(predictions, test_data$votos),
RMSE = caret::RMSE(predictions, test_data$votos),
MAE = caret::MAE(predictions, test_data$votos),
ERR = caret::RMSE(predictions, test_data$votos)/
mean(test_data$votos))
Now let’s talk about the results taken from the test data (most meaningful).
In the model for both elections we could see signs of the model stuggling to fit the data similar to the 2006 elections model but way more preoccupying. At the stage of cross validation our expectations were met, and the model for both elections fared worse than the rest, especially in terms of test data.